## Read in data
datadir = "/home/shyun/Dropbox/research/usc/ocean-provinces/omd/data"
filenames = c("tabulated_darwin_montly_clim_045_090_ver_0_2.csv",
"tabulated_darwin_montly_clim_090_180_ver_0_2.csv",
"tabulated_darwin_montly_clim_180_360_ver_0_2.csv",
"tabulated_darwin_montly_clim_360_720_ver_0_2.csv",
"tabulated_geospatial_montly_clim_045_090_ver_0_2_5.csv",
"tabulated_geospatial_montly_clim_090_180_ver_0_2_5.csv",
"tabulated_geospatial_montly_clim_180_360_ver_0_2_5.csv",
"tabulated_geospatial_montly_clim_360_720_ver_0_2_5.csv",
"tabulated_geospatial_montly_clim_045_090_ver_0_2.csv",
"tabulated_geospatial_montly_clim_090_180_ver_0_2.csv",
"tabulated_geospatial_montly_clim_180_360_ver_0_2.csv",
"tabulated_geospatial_montly_clim_360_720_ver_0_2.csv")
Earthmover’s distance (EMD) is defined as a function of the optimal flow \(F = [f_ij]\) and the ground distance
\[E M D(P, Q)=\frac{\sum_{i=1}^{m} \sum_{j=1}^{n} d_{i j} f_{i j}}{\sum_{i=1}^{m} \sum_{j=1}^{n} f_{i j}}\]
(Taken directly from Orlova et al. https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0151859)
Figure: Schematic of EMD (https://sbl.inria.fr/doc/group__Earth__mover__distance-package.html).
Here is the new, (yellow) box that we will examine.
dat = read.csv(file.path(datadir, filenames[1]))
mydat = dat[which(dat[,"month"]==1),]
colfun = colorRampPalette(c("blue", "red"))
par(mar=c(0,0,0,0))
map("world", fill=TRUE, col="white", bg="white", ylim=c(-90, 90), mar=c(0,0,0,0))
## plot(newmap)##, xlim = c(-20, 59), ylim = c(35, 71), asp = 1)
centers = sample(mydat$Chl, 5)
points(mydat[ ,c("lon", "lat")], pch=15, cex=4,
col=colfun(20)[as.numeric(cut(mydat$Chl,breaks = 20))])
map("world", fill=TRUE, col="white", bg="white", ylim=c(-90, 90), mar=c(0,0,0,0), add=TRUE)
## New box
lonrange = c(-180, -135)
latrange = c(20, 45)
rect(lonrange[1], latrange[1], lonrange[2], latrange[2],
border = c("yellow"), lwd=5)
First, visualize the January and February chlorophyll data (Darwin).
Values are all normalized to sum to 1.
dat = read.csv(file.path(datadir, filenames[2]))
## Jan
mydat = dat[which(dat[,"month"]==1),]
jan.dat = mydat[which(lonrange[1] < mydat[,"lon"] & mydat[, "lon"] < lonrange[2] &
latrange[1] < mydat[,"lat"] & mydat[, "lat"] < latrange[2]),]
jan.mat = make_mat(jan.dat)
jan.mat = jan.mat/sum(jan.mat)
drawmat_precise2(jan.mat, main="Jan, Darwin")
## Feb
mydat = dat[which(dat[,"month"]==2),]
feb.dat = mydat[which(lonrange[1] < mydat[,"lon"] & mydat[, "lon"] < lonrange[2] &
latrange[1] < mydat[,"lat"] & mydat[, "lat"] < latrange[2]),]
feb.mat = make_mat(feb.dat)
feb.mat = feb.mat/sum(feb.mat)
drawmat_precise2(feb.mat, main="Feb, Darwin")
Now, we calculate optimal transport and make a “vector map” of mass movement.
## Get optimal transport
res <- transport(pgrid(jan.mat), pgrid(feb.mat), p=2,
method="aha")
## Plot both
plot(pgrid(jan.mat), pgrid(feb.mat), res,
## Style options
mass = "thickness", acol = rgb(0,0,1,0.5),
rot = TRUE)
title(main="Jan to Feb, Darwin")
Now, we highlight these arrows for four ranges (0-25%, 25%-50%, 50%-75%, 75%-100%) of the magnitude of the mass transfer.
## Separate them by magnitude of the mass transfer
nbin = 4
## cutoffs = seq(from = min(res$mass), to = max(res$mass), length = nbin+1)
cutoffs = quantile(res$mass)
par(mfrow=c(2,2))
for(ii in 1:nbin){
massrange = cutoffs[ii:(ii+1)]
## cols = rep(rgb(0,0,1,0.2), length(res$mass))
cols = rep(NA, length(res$mass))
cols[which(massrange[1] < res$mass & res$mass < massrange[2])] = rgb(1,0,0, 0.7)
plot(pgrid(jan.mat), pgrid(feb.mat), res,
## Style options
mass="thickness", acol=cols,
rot = TRUE)
title(main = paste("Magnitude", c("Lowest", "Second to lowest",
"Second to highest", "Highest")[ii]),
cex.main = 3)
}
From the optimal transport, we directly know the scalar earthmovers distance, by the formula:
\[E M D(P, Q)=\frac{\sum_{i=1}^{m} \sum_{j=1}^{n} d_{i j} f_{i j}}{\sum_{i=1}^{m} \sum_{j=1}^{n} f_{i j}}\]
all.mats = list()
all.pmats = list()
for(ii in 1:12){
## One month's data
mydat = dat[which(dat[,"month"]==ii),]
one.dat = mydat[which(lonrange[1] < mydat[,"lon"] & mydat[, "lon"] < lonrange[2] &
latrange[1] < mydat[,"lat"] & mydat[, "lat"] < latrange[2]),]
one.mat = make_mat(one.dat)
one.mat = one.mat/sum(one.mat)
all.mats[[ii]] = one.mat
all.pmats[[ii]] = pgrid(one.mat)
}
Now, we obtain all twelve months’ Darwin data and visualize optimal transports in adjacent months (Jan to Feb, Feb to March, and so on). Note, I highlighed the top 10% high-mass transfers in red.
par(mfrow=c(6,2))
for(ii in 1:11){
pgrid1 = pgrid(all.mats[[ii]])
pgrid2 = pgrid(all.mats[[ii+1]])
res <- transport(pgrid1, pgrid2, p=2,
method="aha")
## nbin = 2
## cutoffs = seq(from = min(res$mass), to = max(res$mass), length = nbin+1)
cutoffs = quantile(res$mass, probs = c(0.9,1))
jj = 1
massrange = cutoffs[jj:(jj+1)]
cols = rep(rgb(0,0,1,0.2), length(res$mass))
cols[which(massrange[1] < res$mass & res$mass < massrange[2])] = rgb(1,0,0,0.7)
plot(pgrid1, pgrid2, res,
## Style options
mass="thickness", acol=cols,
## lwd=0.1,
rot=TRUE, length=0.2)
title(main = paste0( month.abb[ii], " to ", month.abb[ii+1]), cex.main=2)
}
Now, let’s form a 24 x 24 distance matrix, which encodes all the pairwise earth mover’s distances between all months in real data and Darwin data.
## DARWIN
all.pmats.darwin = list()
dat = read.csv(file.path(datadir, filenames[2]))
nmonth = 12
for(ii in 1:nmonth){
## One month's data
mydat = dat[which(dat[,"month"]==ii),]
one.dat = mydat[which(lonrange[1] <= mydat[,"lon"] & mydat[, "lon"] <= lonrange[2] &
latrange[1] <= mydat[,"lat"] & mydat[, "lat"] <= latrange[2]),]
one.mat = make_mat(one.dat)
## one.mat = one.mat[-nrow(one.mat),]
one.mat = one.mat/sum(one.mat, na.rm=TRUE)
one.mat[which(is.na(one.mat))] = 0
all.pmats.darwin[[ii]] = pgrid(one.mat)
}
## REAL
dat = read.csv(file.path(datadir, filenames[6])) ## real
all.pmats.real = list()
for(ii in 1:nmonth){
## if(ii==1) jj=2 ## replace this...................
## if(ii==12) jj=11
## if(ii %ni% c(1,12)) jj=ii
## One month's data
mydat = dat[which(dat[,"month"]==ii),]
one.dat = mydat[which(lonrange[1] <= mydat[,"lon"] & mydat[, "lon"] <= lonrange[2] &
latrange[1] <= mydat[,"lat"] & mydat[, "lat"] <= latrange[2]),]
one.mat = make_mat(one.dat)
## if(ii!=12) one.mat = one.mat[-nrow(one.mat),]
one.mat = one.mat/sum(one.mat, na.rm=TRUE)
## one.mat[which(is.na(one.mat))] = 0
all.pmats.real[[ii]] = pgrid(one.mat)
}
all.pmats = c(all.pmats.real, all.pmats.darwin)
nmonth = 24
distmat = matrix(NA, nmonth, nmonth)
## mclapply(1:(nmonth)^2, function(imonth){
## ii = imonth %% 12 + 1
## jj = imonth - 12 * (ii-1)
for(ii in 1:nmonth){
for(jj in 1:nmonth){
if(ii>jj){
res <- transport(all.pmats[[ii]], all.pmats[[jj]], p=2, method="aha")
dist = wasserstein(all.pmats[[ii]], all.pmats[[jj]], tplan=res)
distmat[ii,jj] = dist
distmat[jj,ii] = dist
}
}
}
## }, mc.cores=8)
diag(distmat) = 0
diag(distmat) = NA
colnames(distmat) = rownames(distmat) = c(paste0("r", 1:(nmonth/2)),
paste0("d", 1:(nmonth/2)))
## image(Matrix(distmat))
library(lattice)
drawmat_precise2(distmat, contour=FALSE, main="All months' EMD from Real & Darwin data",
panel = function(...){
panel.levelplot(...)
lattice::panel.abline(v = 12.5, lwd=3)
lattice::panel.abline(h = 12.5, lwd=3)
})
## abline(v=13, lwd=5)
## knitr::kable(as.data.frame(signif(distmat,3)))
We can take a look at certain distances; from left to right in the boxplot.
Compared to the larger white box before we can see that the distances between Darwin data and real data (left-most box) is even more separated from the rest of the boxplots.
diag(distmat) = 0
## Extract certain things
pairdists = sapply(1:12, function(ii){
distmat[ii, ii+12]
})
distmat.real = distmat[1:12, 1:12]
distmat.darwin = distmat[13:24, 13:24]
## Combine them into a list
dists = list(pair= pairdists,
darwin = distmat.darwin[lower.tri(distmat.darwin)],
real = distmat.real[lower.tri(distmat.real)],
darwin_serial = unlist(diag(distmat.darwin[-1,-ncol(distmat.darwin)])),
real_serial = unlist(diag(distmat.real[-1,-ncol(distmat.real)])))
## Clean
dists = lapply(dists, function(a){
if(any(is.na(a))){
return(a[-which(is.na(a))])
} else {
return(a)
}
})
names(dists)=c("darwin-to-real","darwin", "real", "darwin m2m", "real m2m")
boxplot(dists,
col=c(rgb(0,0,1,0.2), rep(rgb(1,0,0,0.2),2), rep(rgb(0,1,0,0.2),2)),
main="Distributions of Earth mover's distances", cex.main=2, cex.lab=2)
From a distance matrix containing all pairwise distances, we can produce more interesting analyses and visualizations.
Hierarchical clustering from the pairwise distances: at the beginning of the process, each element is in a cluster of its own. The clusters are then sequentially combined into larger clusters until all elements end up being in the same cluster.
colnames(distmat) = rownames(distmat) = c(paste0("real-", 1:(nmonth/2)),
paste0("darwin-", 1:(nmonth/2)))
distmat <- as.dist(distmat, diag = TRUE)
labelCol <- function(x) {
if (is.leaf(x)) {
## fetch label
label <- attr(x, "label")
## set label color to red for A and B, to blue otherwise
attr(x, "nodePar") <- list(lab.col=ifelse(sapply(label, substr, 1,4) == "real", "blue", "red"))
}
return(x)
}
## plot(hclust(distmat), axes=FALSE)
hc = hclust(distmat)
d <- dendrapply(as.dendrogram(hc), labelCol)
plot(d, axes=FALSE)
Multidimensional scaling (MDS) is a dimension reduction technique to translate + visualize the level of similarity of \(n\) individual cases of a dataset. MDS translates pairwise ‘distances’ among a set of \(n=24\) objects into a configuration of \(n=24\) points mapped into a lower-dimensional Cartesianspace (e.g. 2D) while preserving the pairwise distances.
## Multidimensional scaling
fit <- cmdscale(distmat, eig=TRUE, k=2) # k is the number of dim
# plot solution
x <- fit$points[,1]
y <- fit$points[,2]
par(pty="s")
plot(x, y, xlab="Coordinate 1", ylab="Coordinate 2",
main="Metric MDS", type="n")
text(x, y, labels = c(paste0(1:(nmonth/2)),
paste0(1:(nmonth/2))),
col = c(rep("blue", nmonth/2), rep("red", nmonth/2)),
cex=2)
legend("topright", fill=c('blue', 'red'), legend = c("real", "Darwin"))
Each data source is closer to each other, more so than the same month from the two data sources.
What to do for optimal transport in the presence of land mass to obstruct movement (water can’t cross a peninsula)? One can simply modify the distances so that, the Euclidean distance is the default, but whenever the arrow meets a land mass, the distance is positive infinity so that it incurs an infinite cost for any mass to cross a landmass. (If it really needs to, it will go around the coast.)
Also, the Wasserstein’s distance metric does not seem to be comparable between different size objects. In other words,